Libraries required for this analysis

# Global chunk option: center all figures in the knitted document.
knitr::opts_chunk$set(fig.align="center") 
# Package attach order matters for function masking — do not reorder.
library(rstanarm)   # NOTE(review): not obviously used below — confirm it is needed
library(tidyverse)  # dplyr/tidyr/ggplot2/etc. used throughout
library(tidybayes)  # add_fitted_draws / add_predicted_draws / mean_qi / compare_levels / stat_halfeye
library(modelr) 
library(ggplot2)    # already attached via tidyverse; kept for explicitness
library(magrittr)   # %>% pipe
library(emmeans)    # NOTE(review): not obviously used below — confirm it is needed
library(bayesplot)
library(brms)       # brm() model fitting
library(gganimate)  # NOTE(review): not obviously used below — confirm it is needed

# Use a light ggplot2 theme for all plots in this document.
theme_set(theme_light())

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then measured the user’s accuracy on two tasks: Find Extremum and Retrieve Value.

Given a search algorithm (bsf or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s chance of answering the Find Extremum and Retrieve Value tasks correctly. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s accuracy for these two tasks.

Read in and clean data

# Read the per-trial accuracy data and coerce the experimental design
# variables to factors so they are treated as categorical predictors.
accuracy_data <- read.csv("processed_accuracy_split.csv")
factor_cols <- c("oracle", "search", "dataset")
accuracy_data[factor_cols] <- lapply(accuracy_data[factor_cols], as.factor)

# Containers filled in below: one fitted model, one set of posterior draws,
# and one condition-difference summary per task.
models <- list()

draw_data <- list()

search_differences <- list()
oracle_differences <- list()

# Fixed RNG seed so sampling and posterior draws are reproducible.
seed <- 12

Find Extremum: Building a Model for Accuracy Analysis

# Fit a Bernoulli (logistic) model of Find Extremum accuracy with the full
# three-way interaction between oracle, search, and dataset.
data_find_extremum <- subset(accuracy_data, task == "1. Find Extremum")
models$find_extremum <- brm(accuracy ~ oracle*search*dataset, 
                    data = data_find_extremum,
                    # NOTE(review): normal(1, .05) is an extremely tight prior on the
                    # logit-scale intercept (baseline accuracy pinned near plogis(1) ~ 0.73);
                    # confirm this much prior certainty is intended.
                    prior = c(prior(normal(1, .05), class = Intercept)),
                    family = bernoulli(link = "logit"),
                    warmup = 500, 
                    iter = 3000, 
                    chains = 2, 
                    cores=2,
                    seed=seed,
                    # Cache the fit to disk; delete this file to force a refit.
                    file = "acc_find_extremum"
                    )
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling

Find Extremum: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

# Convergence check: Rhat should be ~1.00 and Bulk_ESS/Tail_ESS comfortably large.
summary(models$find_extremum)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: accuracy ~ oracle * search * dataset 
##    Data: data_find_extremum (Number of observations: 59) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##                                      Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept                                0.66      0.80    -0.83     2.28 1.00
## oracledziban                             0.89      1.29    -1.54     3.54 1.00
## searchdfs                                0.06      1.20    -2.26     2.54 1.00
## datasetmovies                            0.04      1.17    -2.28     2.32 1.00
## oracledziban:searchdfs                  -0.92      1.73    -4.42     2.46 1.00
## oracledziban:datasetmovies              -0.04      1.79    -3.53     3.43 1.00
## searchdfs:datasetmovies                  0.75      1.72    -2.63     4.06 1.00
## oracledziban:searchdfs:datasetmovies    -0.65      2.44    -5.36     4.18 1.00
##                                      Bulk_ESS Tail_ESS
## Intercept                                2280     2229
## oracledziban                             1994     2397
## searchdfs                                2042     2431
## datasetmovies                            2083     2590
## oracledziban:searchdfs                   1776     2302
## oracledziban:datasetmovies               1783     2837
## searchdfs:datasetmovies                  1736     2485
## oracledziban:searchdfs:datasetmovies     1626     2654
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for the model.

# Posterior density and trace plots per parameter; look for well-mixed chains.
plot(models$find_extremum)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating their effects).

# Bivariate scatter of posterior draws; strong correlations flag weakly identified parameters.
pairs(models$find_extremum)

A confusion matrix can be used to check our correct classification rate (a useful measure to see how well our model fits our data).

# Posterior predictive mean per observation, thresholded at 0.5 into a hard
# 0/1 classification, then cross-tabulated against the observed outcome.
# (The variable is kept named `pred` so the printed table header is unchanged.)
pred <- predict(models$find_extremum, type = "response")
pred <- as.numeric(pred[, 1] > 0.5)
confusion_matrix <- table(pred, pull(data_find_extremum, accuracy))
confusion_matrix
##     
## pred  0  1
##    1  5 54

Visualization of parameter effects via draws from our model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Expected-probability ("fitted") draws for each design cell, labeled with the
# task and an oracle_search condition string for plotting.
draw_data$find_extremum <- data_find_extremum %>%
  add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA) %>%
  group_by(search, oracle, dataset, .draw) %>%
  mutate(
    task = "1. Find Extremum",
    condition = paste(oracle, search, sep = "_")
  )

# Posterior distribution of predicted accuracy per oracle/search condition.
# FIX: alpha was previously mapped inside aes(). Mapping a constant in aes()
# creates a spurious alpha legend and rescales the value via the alpha scale
# instead of setting transparency; a fixed aesthetic belongs outside aes().
find_extremum_plot <- draw_data$find_extremum %>% ggplot(aes(
    x = .value,
    y = condition,
    fill = dataset
  )) + stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
    labs(x = "Predicted Accuracy (p_correct)", y = "Oracle/Search Combination") 

find_extremum_plot

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

# Numeric 95% / 50% credible interval bounds per design cell (posterior mean + quantile interval).
fit_info <-  draw_data$find_extremum %>% group_by(search, oracle, dataset) %>% mean_qi(.value, .width = c(.95, .5))
fit_info
## # A tibble: 16 x 9
## # Groups:   search, oracle [4]
##    search oracle    dataset     .value .lower .upper .width .point .interval
##    <fct>  <fct>     <fct>        <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
##  1 bfs    compassql birdstrikes  0.640  0.303  0.907   0.95 mean   qi       
##  2 bfs    compassql movies       0.648  0.315  0.913   0.95 mean   qi       
##  3 bfs    dziban    birdstrikes  0.790  0.479  0.972   0.95 mean   qi       
##  4 bfs    dziban    movies       0.793  0.491  0.971   0.95 mean   qi       
##  5 dfs    compassql birdstrikes  0.651  0.309  0.919   0.95 mean   qi       
##  6 dfs    compassql movies       0.787  0.487  0.969   0.95 mean   qi       
##  7 dfs    dziban    birdstrikes  0.646  0.314  0.913   0.95 mean   qi       
##  8 dfs    dziban    movies       0.669  0.363  0.912   0.95 mean   qi       
##  9 bfs    compassql birdstrikes  0.640  0.531  0.760   0.5  mean   qi       
## 10 bfs    compassql movies       0.648  0.537  0.767   0.5  mean   qi       
## 11 bfs    dziban    birdstrikes  0.790  0.710  0.894   0.5  mean   qi       
## 12 bfs    dziban    movies       0.793  0.717  0.889   0.5  mean   qi       
## 13 dfs    compassql birdstrikes  0.651  0.541  0.772   0.5  mean   qi       
## 14 dfs    compassql movies       0.787  0.710  0.885   0.5  mean   qi       
## 15 dfs    dziban    birdstrikes  0.646  0.537  0.768   0.5  mean   qi       
## 16 dfs    dziban    movies       0.669  0.572  0.776   0.5  mean   qi
## Saving 7 x 5 in image

Find Extremum: Differences Between Conditions

Next, we want to see if there is any significant difference in accuracy between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

Differences in search algorithms:

# Posterior-predictive draws per design cell (re_formula = NA: population-level terms only).
find_extremum_predictive_data <- data_find_extremum %>%
    add_predicted_draws(models$find_extremum, seed = seed, re_formula = NA) %>%
    group_by(search, oracle, dataset, .draw)

# Per-draw mean accuracy by search algorithm and dataset, then the dfs - bfs
# difference within each draw.
# FIX: weighted.mean() was called with no weights (identical to mean(), but
# misleading), and summarize() emitted a regrouping message; mean() plus an
# explicit .groups = "drop_last" (the documented default) express the same
# computation without the noise.
search_differences$find_extremum <- find_extremum_predictive_data %>%
    group_by(search, dataset, .draw) %>%
    summarize(accuracy = mean(.prediction), .groups = "drop_last") %>%
    compare_levels(accuracy, by = search) %>%
    rename(difference_in_accuracy = accuracy)
search_differences$find_extremum$metric <- "1. Find Extremum"

# Posterior distribution of the dfs - bfs accuracy difference, faceted by dataset.
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
search_differences$find_extremum %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", search_differences$find_extremum$search[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

# Interval bounds for the search difference; zero inside the 95% CI means no credible difference.
search_differences$find_extremum %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    dataset   difference_in_accur… .lower .upper .width .point .interval
##   <chr>     <fct>                    <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - bfs birdstri…             -0.0648  -0.5   0.357    0.95 mean   qi       
## 2 dfs - bfs movies                 0.00237 -0.429 0.408    0.95 mean   qi       
## 3 dfs - bfs birdstri…             -0.0648  -0.214 0.0714   0.5  mean   qi       
## 4 dfs - bfs movies                 0.00237 -0.121 0.146    0.5  mean   qi

Differences in oracle:

# Per-draw mean accuracy by oracle and dataset, then the dziban - compassql
# difference within each draw.
# FIX: mean() replaces the weights-less weighted.mean(); the explicit
# .groups = "drop_last" (the documented default) silences the regrouping
# message without changing the grouping.
oracle_differences$find_extremum <- find_extremum_predictive_data %>%
    group_by(oracle, dataset, .draw) %>%
    summarize(accuracy = mean(.prediction), .groups = "drop_last") %>%
    compare_levels(accuracy, by = oracle) %>%
    rename(difference_in_accuracy = accuracy)
oracle_differences$find_extremum$metric <- "1. Find Extremum"

# Posterior distribution of the dziban - compassql accuracy difference, by dataset.
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
oracle_differences$find_extremum %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", oracle_differences$find_extremum$oracle[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

# Interval bounds for the oracle difference; zero inside the 95% CI means no credible difference.
oracle_differences$find_extremum %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle      dataset  difference_in_acc…  .lower .upper .width .point .interval
##   <chr>       <fct>                 <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - c… birdstr…            0.0738  -0.357   0.5     0.95 mean   qi       
## 2 dziban - c… movies              0.00782 -0.429   0.412   0.95 mean   qi       
## 3 dziban - c… birdstr…            0.0738  -0.0714  0.214   0.5  mean   qi       
## 4 dziban - c… movies              0.00782 -0.121   0.146   0.5  mean   qi

Retrieve Value: Building a Model for Accuracy Analysis

# Fit a Bernoulli (logistic) model of Retrieve Value accuracy with the full
# three-way interaction between oracle, search, and dataset.
data_retrieve_value <- subset(accuracy_data, task == "2. Retrieve Value")
models$retrieve_value <- brm(accuracy ~ oracle*search*dataset, 
                    data = data_retrieve_value,
                    # NOTE(review): normal(1, .05) is an extremely tight prior on the
                    # logit-scale intercept (baseline accuracy pinned near plogis(1) ~ 0.73);
                    # confirm this much prior certainty is intended.
                    prior = c(prior(normal(1, .05), class = Intercept)),
                    family = bernoulli(link = "logit"),
                    warmup = 500, 
                    iter = 3000, 
                    chains = 2, 
                    cores=2,
                    seed=seed,
                    # Cache the fit to disk; delete this file to force a refit.
                    file = "acc_retrieve_value"
                    )
## Compiling Stan program...
## Start sampling

Retrieve Value: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

# Convergence check: Rhat should be ~1.00 and Bulk_ESS/Tail_ESS comfortably large.
summary(models$retrieve_value)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: accuracy ~ oracle * search * dataset 
##    Data: data_retrieve_value (Number of observations: 59) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##                                      Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept                                1.53      0.96    -0.18     3.60 1.00
## oracledziban                            -0.00      1.46    -2.85     2.93 1.00
## searchdfs                               -0.00      1.43    -2.85     2.82 1.00
## datasetmovies                           -1.51      1.32    -4.24     0.94 1.00
## oracledziban:searchdfs                  -0.88      1.93    -4.66     2.83 1.00
## oracledziban:datasetmovies               0.75      1.86    -2.95     4.33 1.00
## searchdfs:datasetmovies                  1.48      1.90    -2.11     5.29 1.00
## oracledziban:searchdfs:datasetmovies    -0.61      2.57    -5.55     4.31 1.00
##                                      Bulk_ESS Tail_ESS
## Intercept                                1887     1918
## oracledziban                             1812     1718
## searchdfs                                1799     1893
## datasetmovies                            1781     1754
## oracledziban:searchdfs                   1725     2026
## oracledziban:datasetmovies               1708     1988
## searchdfs:datasetmovies                  1686     2280
## oracledziban:searchdfs:datasetmovies     1685     2221
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for the model.

# Posterior density and trace plots per parameter; look for well-mixed chains.
plot(models$retrieve_value)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating their effects).

# Bivariate scatter of posterior draws; strong correlations flag weakly identified parameters.
pairs(models$retrieve_value)

A confusion matrix can be used to check our correct classification rate (a useful measure to see how well our model fits our data).

# Posterior predictive mean per observation, thresholded at 0.5 into a hard
# 0/1 classification, then cross-tabulated against the observed outcome.
# (The variable is kept named `pred` so the printed table header is unchanged.)
pred <- predict(models$retrieve_value, type = "response")
pred <- as.numeric(pred[, 1] > 0.5)
confusion_matrix <- table(pred, pull(data_retrieve_value, accuracy))
confusion_matrix
##     
## pred  0  1
##    0  1  0
##    1  4 54

Visualization of parameter effects via draws from our model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Expected-probability ("fitted") draws for each design cell, labeled with the
# task and an oracle_search condition string for plotting.
draw_data$retrieve_value <- data_retrieve_value %>%
  add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA) %>%
  group_by(search, oracle, dataset, .draw) %>%
  mutate(
    task = "2. Retrieve Value",
    condition = paste(oracle, search, sep = "_")
  )

# Posterior distribution of predicted accuracy per oracle/search condition.
# FIX: alpha was previously mapped inside aes(). Mapping a constant in aes()
# creates a spurious alpha legend and rescales the value via the alpha scale
# instead of setting transparency; a fixed aesthetic belongs outside aes().
retrieve_value_plot <- draw_data$retrieve_value %>% ggplot(aes(
    x = .value,
    y = condition,
    fill = dataset
  )) + stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
    labs(x = "Predicted Accuracy (p_correct)", y = "Oracle/Search Combination") 

retrieve_value_plot

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

# Numeric 95% / 50% credible interval bounds per design cell (posterior mean + quantile interval).
fit_info <-  draw_data$retrieve_value %>% group_by(search, oracle, dataset) %>% mean_qi(.value, .width = c(.95, .5))
fit_info
## # A tibble: 16 x 9
## # Groups:   search, oracle [4]
##    search oracle    dataset     .value .lower .upper .width .point .interval
##    <fct>  <fct>     <fct>        <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
##  1 bfs    compassql birdstrikes  0.786  0.454  0.973   0.95 mean   qi       
##  2 bfs    compassql movies       0.504  0.172  0.823   0.95 mean   qi       
##  3 bfs    dziban    birdstrikes  0.785  0.460  0.975   0.95 mean   qi       
##  4 bfs    dziban    movies       0.663  0.345  0.918   0.95 mean   qi       
##  5 dfs    compassql birdstrikes  0.787  0.471  0.971   0.95 mean   qi       
##  6 dfs    compassql movies       0.786  0.496  0.967   0.95 mean   qi       
##  7 dfs    dziban    birdstrikes  0.637  0.305  0.908   0.95 mean   qi       
##  8 dfs    dziban    movies       0.661  0.342  0.913   0.95 mean   qi       
##  9 bfs    compassql birdstrikes  0.786  0.706  0.892   0.5  mean   qi       
## 10 bfs    compassql movies       0.504  0.378  0.629   0.5  mean   qi       
## 11 bfs    dziban    birdstrikes  0.785  0.702  0.892   0.5  mean   qi       
## 12 bfs    dziban    movies       0.663  0.560  0.775   0.5  mean   qi       
## 13 dfs    compassql birdstrikes  0.787  0.706  0.894   0.5  mean   qi       
## 14 dfs    compassql movies       0.786  0.709  0.884   0.5  mean   qi       
## 15 dfs    dziban    birdstrikes  0.637  0.522  0.760   0.5  mean   qi       
## 16 dfs    dziban    movies       0.661  0.561  0.773   0.5  mean   qi
## Saving 7 x 5 in image

Retrieve Value: Differences Between Conditions

Next, we want to see if there is any significant difference between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

Differences in search algorithms:

# Posterior-predictive draws per design cell (re_formula = NA: population-level terms only).
retrieve_value_predictive_data <- data_retrieve_value %>%
    add_predicted_draws(models$retrieve_value, seed = seed, re_formula = NA) %>%
    group_by(search, oracle, dataset, .draw)

# Per-draw mean accuracy by search algorithm and dataset, then the dfs - bfs
# difference within each draw.
# FIX: mean() replaces the weights-less weighted.mean(); the explicit
# .groups = "drop_last" (the documented default) silences the regrouping
# message without changing the grouping.
search_differences$retrieve_value <- retrieve_value_predictive_data %>%
    group_by(search, dataset, .draw) %>%
    summarize(accuracy = mean(.prediction), .groups = "drop_last") %>%
    compare_levels(accuracy, by = search) %>%
    rename(difference_in_accuracy = accuracy)
search_differences$retrieve_value$metric <- "2. Retrieve Value"

# Posterior distribution of the dfs - bfs accuracy difference, faceted by dataset.
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
search_differences$retrieve_value %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", search_differences$retrieve_value$search[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

# Interval bounds for the search difference; zero inside the 95% CI means no credible difference.
search_differences$retrieve_value %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search   dataset    difference_in_accu…  .lower .upper .width .point .interval
##   <chr>    <fct>                    <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - b… birdstrik…             -0.0743 -0.5    0.357    0.95 mean   qi       
## 2 dfs - b… movies                  0.133  -0.304  0.546    0.95 mean   qi       
## 3 dfs - b… birdstrik…             -0.0743 -0.214  0.0714   0.5  mean   qi       
## 4 dfs - b… movies                  0.133  -0.0333 0.279    0.5  mean   qi

Differences in oracle:

# Per-draw mean accuracy by oracle and dataset, then the dziban - compassql
# difference within each draw.
# FIX: mean() replaces the weights-less weighted.mean(); the explicit
# .groups = "drop_last" (the documented default) silences the regrouping
# message without changing the grouping.
oracle_differences$retrieve_value <- retrieve_value_predictive_data %>%
    group_by(oracle, dataset, .draw) %>%
    summarize(accuracy = mean(.prediction), .groups = "drop_last") %>%
    compare_levels(accuracy, by = oracle) %>%
    rename(difference_in_accuracy = accuracy)
oracle_differences$retrieve_value$metric <- "2. Retrieve Value"

# Posterior distribution of the dziban - compassql accuracy difference, by dataset.
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
oracle_differences$retrieve_value %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", oracle_differences$retrieve_value$oracle[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

# Interval bounds for the oracle difference; zero inside the 95% CI means no credible difference.
oracle_differences$retrieve_value %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle      dataset   difference_in_acc… .lower .upper .width .point .interval
##   <chr>       <fct>                  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - c… birdstri…           -0.0727  -0.5   0.357    0.95 mean   qi       
## 2 dziban - c… movies               0.00309 -0.429 0.417    0.95 mean   qi       
## 3 dziban - c… birdstri…           -0.0727  -0.214 0.0714   0.5  mean   qi       
## 4 dziban - c… movies               0.00309 -0.167 0.15     0.5  mean   qi

Summary Plots

Putting all of the plots for search algorithm differences on the same plot:

# Stack the per-task search-difference draws and plot both tasks together.
combined_search_differences <- rbind(search_differences$find_extremum, search_differences$retrieve_value)
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
search_differences_plot <- combined_search_differences %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", combined_search_differences$search[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

search_differences_plot

# Combined 95% / 50% interval table for the search difference, per task and dataset.
search_intervals <- combined_search_differences %>% group_by(search, dataset, metric) %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
search_intervals
## # A tibble: 8 x 9
## # Groups:   search, dataset [2]
##   search  dataset metric difference_in_a…  .lower .upper .width .point .interval
##   <chr>   <fct>   <chr>             <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - … birdst… 1. Fi…         -0.0648  -0.5    0.357    0.95 mean   qi       
## 2 dfs - … birdst… 2. Re…         -0.0743  -0.5    0.357    0.95 mean   qi       
## 3 dfs - … movies  1. Fi…          0.00237 -0.429  0.408    0.95 mean   qi       
## 4 dfs - … movies  2. Re…          0.133   -0.304  0.546    0.95 mean   qi       
## 5 dfs - … birdst… 1. Fi…         -0.0648  -0.214  0.0714   0.5  mean   qi       
## 6 dfs - … birdst… 2. Re…         -0.0743  -0.214  0.0714   0.5  mean   qi       
## 7 dfs - … movies  1. Fi…          0.00237 -0.121  0.146    0.5  mean   qi       
## 8 dfs - … movies  2. Re…          0.133   -0.0333 0.279    0.5  mean   qi

Putting all of the plots for oracle differences on the same plot:

# Stack the per-task oracle-difference draws and plot both tasks together.
combined_oracle_differences <- rbind(oracle_differences$find_extremum, oracle_differences$retrieve_value)
# FIX: alpha moved out of aes() (a constant mapped in aes() creates a bogus
# legend and rescales the value); the axis label is pulled with $/[ so a plain
# string, not a 1x1 tibble, is pasted.
oracle_differences_plot <- combined_oracle_differences %>%
      ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
      xlab(paste0("Expected Difference in Accuracy (", combined_oracle_differences$oracle[1], ")")) + 
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal() +
      facet_grid(. ~ dataset)

oracle_differences_plot

# Combined 95% / 50% interval table for the oracle difference, per task and dataset.
oracle_intervals <- combined_oracle_differences %>% group_by(oracle, dataset, metric) %>% mean_qi(difference_in_accuracy, .width = c(.95, .5))
oracle_intervals
## # A tibble: 8 x 9
## # Groups:   oracle, dataset [2]
##   oracle  dataset metric difference_in_a…  .lower .upper .width .point .interval
##   <chr>   <fct>   <chr>             <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban… birdst… 1. Fi…          0.0738  -0.357  0.5      0.95 mean   qi       
## 2 dziban… birdst… 2. Re…         -0.0727  -0.5    0.357    0.95 mean   qi       
## 3 dziban… movies  1. Fi…          0.00782 -0.429  0.412    0.95 mean   qi       
## 4 dziban… movies  2. Re…          0.00309 -0.429  0.417    0.95 mean   qi       
## 5 dziban… birdst… 1. Fi…          0.0738  -0.0714 0.214    0.5  mean   qi       
## 6 dziban… birdst… 2. Re…         -0.0727  -0.214  0.0714   0.5  mean   qi       
## 7 dziban… movies  1. Fi…          0.00782 -0.121  0.146    0.5  mean   qi       
## 8 dziban… movies  2. Re…          0.00309 -0.167  0.15     0.5  mean   qi